{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "from collections import defaultdict\n",
    "import math\n",
    "import operator\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "函数说明:创建数据样本\n",
    "Returns:\n",
    "    dataset - 实验样本切分的词条\n",
    "    classVec - 类别标签向量\n",
    "\"\"\"\n",
    "def loadDataSet():\n",
    "    dataset = [ ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],    # 切分的词条\n",
    "                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\n",
    "                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\n",
    "                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\n",
    "                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\n",
    "                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'] ]\n",
    "    classVec = [0, 1, 0, 1, 0, 1]  # 类别标签向量，1代表好，0代表不好\n",
    "    return dataset, classVec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "函数说明：特征选择TF-IDF算法\n",
    "Parameters:\n",
    "     list_words:词列表\n",
    "Returns:\n",
    "     dict_feature_select:特征选择词字典\n",
    "\"\"\"\n",
    "def feature_select(list_words):\n",
    "    #总词频统计\n",
    "    doc_frequency=defaultdict(int)\n",
    "    for word_list in list_words:\n",
    "        for i in word_list:\n",
    "            doc_frequency[i]+=1\n",
    "#     print('doc_frequency：',doc_frequency)\n",
    "    print('doc_frequency的长度：',len(doc_frequency))\n",
    "    #计算每个词的TF值\n",
    "    sum_doc_frequency=0\n",
    "    for i in doc_frequency:\n",
    "        sum_doc_frequency=sum_doc_frequency+doc_frequency[i]\n",
    "    print('sum_doc_frequency',sum_doc_frequency)\n",
    "    word_tf={}  #存储每个词的tf值\n",
    "    for i in doc_frequency:\n",
    "        word_tf[i]=doc_frequency[i]/sum_doc_frequency\n",
    "    #计算每个词的IDF值\n",
    "    doc_num=len(list_words)\n",
    "    word_idf={} #存储每个词的idf值\n",
    "    word_doc=defaultdict(int) #存储包含该词的文档数\n",
    "    for i in doc_frequency:\n",
    "        for j in list_words:\n",
    "            if i in j:\n",
    "                word_doc[i]+=1\n",
    "    for i in doc_frequency:\n",
    "        word_idf[i]=math.log(doc_num/(word_doc[i]+1))\n",
    " \n",
    "    #计算每个词的TF*IDF的值\n",
    "    word_tf_idf={}\n",
    "    for i in doc_frequency:\n",
    "        word_tf_idf[i]=word_tf[i]*word_idf[i]\n",
    "    # 对字典按值由大到小排序\n",
    "    dict_feature_select=sorted(word_tf_idf.items(),key=operator.itemgetter(1),reverse=True)\n",
    "    return dict_feature_select"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_values([3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 3, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1])\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "unsupported operand type(s) for /: 'int' and 'dict_values'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-90-8b1ffee80817>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[0mdata_list\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlabel_list\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mloadDataSet\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m#加载数据\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mfeatures\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfeature_select\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_list\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m#所有词的TF-IDF值\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      3\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<ipython-input-89-1d7e48c96844>\u001b[0m in \u001b[0;36mfeature_select\u001b[1;34m(list_words)\u001b[0m\n\u001b[0;32m     24\u001b[0m     \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc_frequency\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     25\u001b[0m     \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mdoc_frequency\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 26\u001b[1;33m         \u001b[0mword_tf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdoc_frequency\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m/\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc_frequency\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     27\u001b[0m     \u001b[1;31m#计算每个词的IDF值\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     28\u001b[0m     \u001b[0mdoc_num\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlist_words\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mTypeError\u001b[0m: unsupported operand type(s) for /: 'int' and 'dict_values'"
     ]
    }
   ],
   "source": [
    "data_list,label_list=loadDataSet() #加载数据\n",
    "features=feature_select(data_list) #所有词的TF-IDF值\n",
    "print(features)\n",
    "print(len(features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('to', 0.0322394037469742),\n",
       " ('stop', 0.0322394037469742),\n",
       " ('worthless', 0.0322394037469742),\n",
       " ('my', 0.028288263356383563),\n",
       " ('dog', 0.028288263356383563),\n",
       " ('him', 0.028288263356383563),\n",
       " ('stupid', 0.028288263356383563),\n",
       " ('has', 0.025549122992281622),\n",
       " ('flea', 0.025549122992281622),\n",
       " ('problems', 0.025549122992281622),\n",
       " ('help', 0.025549122992281622),\n",
       " ('please', 0.025549122992281622),\n",
       " ('maybe', 0.025549122992281622),\n",
       " ('not', 0.025549122992281622),\n",
       " ('take', 0.025549122992281622),\n",
       " ('park', 0.025549122992281622),\n",
       " ('dalmation', 0.025549122992281622),\n",
       " ('is', 0.025549122992281622),\n",
       " ('so', 0.025549122992281622),\n",
       " ('cute', 0.025549122992281622),\n",
       " ('I', 0.025549122992281622),\n",
       " ('love', 0.025549122992281622),\n",
       " ('posting', 0.025549122992281622),\n",
       " ('garbage', 0.025549122992281622),\n",
       " ('mr', 0.025549122992281622),\n",
       " ('licks', 0.025549122992281622),\n",
       " ('ate', 0.025549122992281622),\n",
       " ('steak', 0.025549122992281622),\n",
       " ('how', 0.025549122992281622),\n",
       " ('quit', 0.025549122992281622),\n",
       " ('buying', 0.025549122992281622),\n",
       " ('food', 0.025549122992281622)]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "输出x_train文本向量：\n",
      "[[0.70710678 0.         0.70710678 0.         0.         0.\n",
      "  0.         0.         0.         0.        ]\n",
      " [0.         0.3349067  0.         0.44036207 0.         0.44036207\n",
      "  0.44036207 0.44036207 0.         0.3349067 ]\n",
      " [0.         0.22769009 0.         0.         0.89815533 0.\n",
      "  0.         0.         0.29938511 0.22769009]]\n",
      "输出x_test文本向量：\n",
      "[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    " \n",
    "x_train = ['TF-IDF 主要 思想 是','算法 一个 重要 特点 可以 脱离 语料库 背景',\n",
    "           '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要']\n",
    "x_test=['原始 文本 进行 标记','主要 思想']\n",
    " \n",
    "#该类会将文本中的词语转换为词频矩阵，矩阵元素a[i][j] 表示j词在i类文本下的词频\n",
    "vectorizer = CountVectorizer(max_features=10)\n",
    "#该类会统计每个词语的tf-idf权值\n",
    "tf_idf_transformer = TfidfTransformer()\n",
    "#将文本转为词频矩阵并计算tf-idf\n",
    "tf_idf = tf_idf_transformer.fit_transform(vectorizer.fit_transform(x_train))\n",
    "#将tf-idf矩阵抽取出来，元素a[i][j]表示j词在i类文本中的tf-idf权重\n",
    "x_train_weight = tf_idf.toarray()\n",
    " \n",
    "#对测试集进行tf-idf权重计算\n",
    "tf_idf = tf_idf_transformer.transform(vectorizer.transform(x_test))\n",
    "x_test_weight = tf_idf.toarray()  # 测试集TF-IDF权重矩阵\n",
    " \n",
    "print('输出x_train文本向量：')\n",
    "print(x_train_weight)\n",
    "print('输出x_test文本向量：')\n",
    "print(x_test_weight)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numpy import *\n",
    "import re\n",
    "import operator\n",
    "import math\n",
    "from collections import defaultdict\n",
    "import operator\n",
    "from os import listdir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    " # 创建一个包含在所有文档中出现的不重复的列表\n",
    "def create_vocab_list(data_set):\n",
    "\n",
    "    # 创建一个空集\n",
    "    vocab_set = set([])\n",
    "\n",
    "    for document in data_set:\n",
    "        vocab_set = vocab_set | set(document)\n",
    "    # 创建两个集合的并集\n",
    "    return list(vocab_set)\n",
    "\n",
    "# 切割分类文本\n",
    "def text_parse(big_string):\n",
    "    regEx = re.compile('\\\\W')  \n",
    "    list_of_tokens = regEx.split(big_string)\n",
    "#     list_of_tokens = re.split('\\W+', big_string)\n",
    "    #如果单词长度太短，就忽略该单词\n",
    "    return [tok.lower() for tok in list_of_tokens if len(tok) > 2]\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def feature_select(list_words):\n",
    "    #总词频统计\n",
    "    doc_frequency=defaultdict(int)\n",
    "    for word_list in list_words:\n",
    "        for i in word_list:\n",
    "            doc_frequency[i]+=1\n",
    "#     print('doc_frequency：',doc_frequency)\n",
    "    print('doc_frequency的长度：',len(doc_frequency))\n",
    "    #计算每个词的TF值\n",
    "    sum_doc_frequency=0\n",
    "    for i in doc_frequency:\n",
    "        sum_doc_frequency=sum_doc_frequency+doc_frequency[i]\n",
    "    print('sum_doc_frequency',sum_doc_frequency)\n",
    "    word_tf={}  #存储每个词的tf值\n",
    "    for i in doc_frequency:\n",
    "        word_tf[i]=doc_frequency[i]/sum_doc_frequency\n",
    "    #计算每个词的IDF值\n",
    "    doc_num=len(list_words)\n",
    "    word_idf={} #存储每个词的idf值\n",
    "    word_doc=defaultdict(int) #存储包含该词的文档数\n",
    "    for i in doc_frequency:\n",
    "        for j in list_words:\n",
    "            if i in j:\n",
    "                word_doc[i]+=1\n",
    "    for i in doc_frequency:\n",
    "        word_idf[i]=math.log(doc_num/(word_doc[i]+1))\n",
    "    #计算每个词的TF*IDF的值\n",
    "    word_tf_idf={}\n",
    "    for i in doc_frequency:\n",
    "        word_tf_idf[i]=word_tf[i]*word_idf[i]\n",
    "    # 对字典按值由大到小排序\n",
    "    dict_feature_select=sorted(word_tf_idf.items(),key=operator.itemgetter(1),reverse=True)\n",
    "    return dict_feature_select"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "mylist=listdir('mini_newsgroups/')\n",
    "doc_list=[]\n",
    "dircontent=[]\n",
    "dircontent_sencond=[]\n",
    "for i in mylist: \n",
    "    if 'A'<=i[0]<='D':\n",
    "        dircontent.append(i)\n",
    "for i in dircontent:\n",
    "    dircontent_sencond=listdir('mini_newsgroups/%s/'%i)\n",
    "    for j in dircontent_sencond:\n",
    "        word_list = text_parse(str(open('mini_newsgroups/%s/%s'%(i,j),'rb').read()))\n",
    "        doc_list.append(word_list)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "doc_frequency的长度： 49057\n",
      "sum_doc_frequency 479128\n"
     ]
    }
   ],
   "source": [
    "last_result=feature_select(doc_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('dos', 0.004219933824405356),\n",
       " ('you', 0.0037607257491510055),\n",
       " ('they', 0.0034970509266022236),\n",
       " ('windows', 0.003468468480291792),\n",
       " ('com', 0.0034431764831130713),\n",
       " ('that', 0.0033358278784902944),\n",
       " ('was', 0.003169219663629844),\n",
       " ('are', 0.0030681457960167967),\n",
       " ('will', 0.00291519985571811),\n",
       " ('talk', 0.0028955005002085053),\n",
       " ('t25', 0.002860082074460113),\n",
       " ('comp', 0.002849348956160552),\n",
       " ('not', 0.0028457311960635636),\n",
       " ('with', 0.002844448803743778),\n",
       " ('have', 0.002831126180055342),\n",
       " ('this', 0.0027786031292295005),\n",
       " ('and', 0.0027619805321936167),\n",
       " ('misc', 0.0026730753076809484),\n",
       " ('your', 0.0026577474686271907),\n",
       " ('would', 0.0026366698125250623),\n",
       " ('can', 0.002588939913597519),\n",
       " ('from', 0.002532199070341917),\n",
       " ('for', 0.0025023606153771567),\n",
       " ('alt', 0.002465060774381609),\n",
       " ('politics', 0.002451850200083493),\n",
       " ('what', 0.002445283770021167),\n",
       " ('sci', 0.002442155328828527),\n",
       " ('were', 0.0024298027403287232),\n",
       " ('soc', 0.002419008199536343),\n",
       " ('one', 0.0023870351352932445),\n",
       " ('space', 0.0023498590672174045),\n",
       " ('but', 0.002345553288589892),\n",
       " ('there', 0.002274035078530535),\n",
       " ('their', 0.0022597088395253807),\n",
       " ('nthe', 0.002240314786300671),\n",
       " ('the', 0.002238409815057916),\n",
       " ('state', 0.0022372640655314103),\n",
       " ('some', 0.002229200690777568),\n",
       " ('people', 0.0022186772730821247),\n",
       " ('who', 0.0022180373270449557),\n",
       " ('all', 0.002213228454349077),\n",
       " ('culture', 0.002209011584394478),\n",
       " ('has', 0.0021942831745425983),\n",
       " ('more', 0.0021893861544095145),\n",
       " ('uiuc', 0.0021866814298882645),\n",
       " ('don', 0.002176871099235366),\n",
       " ('ohio', 0.0021685134668599144),\n",
       " ('org', 0.0021652745102915217),\n",
       " ('about', 0.002149981406045662),\n",
       " ('stephanopoulos', 0.0021485494608139383),\n",
       " ('which', 0.002134524910918735),\n",
       " ('any', 0.002103199072488524),\n",
       " ('use', 0.002093556729976702),\n",
       " ('than', 0.002068712543073955),\n",
       " ('145', 0.0020678928126606727),\n",
       " ('when', 0.002062840682597679),\n",
       " ('had', 0.0020388178531451405),\n",
       " ('his', 0.002019508383686839),\n",
       " ('out', 0.002002840231430883),\n",
       " ('other', 0.001981797327577438),\n",
       " ('should', 0.001942961809304984),\n",
       " ('system', 0.001939139356709526),\n",
       " ('netcom', 0.0019379864879669052),\n",
       " ('them', 0.0019225507706991888),\n",
       " ('ibm', 0.0019207154707783398),\n",
       " ('new', 0.0019177688507318468),\n",
       " ('think', 0.0018922636793767187),\n",
       " ('been', 0.0018884134708801248),\n",
       " ('just', 0.0018767638628576745),\n",
       " ('know', 0.0018644508015480362),\n",
       " ('get', 0.0018582647175472073),\n",
       " ('only', 0.001841932604502246),\n",
       " ('gov', 0.0018413087557060837),\n",
       " ('how', 0.0018402880186990337),\n",
       " ('like', 0.0018371900030873922),\n",
       " ('nasa', 0.001819995173331644),\n",
       " ('may', 0.0018100753224040908),\n",
       " ('also', 0.001786815490368039),\n",
       " ('usenet', 0.001753952978071215),\n",
       " ('a86', 0.001730000864551483),\n",
       " ('two', 0.0017178176514256996),\n",
       " ('these', 0.0016733664700292511),\n",
       " ('rec', 0.0016671856961763705),\n",
       " ('because', 0.0016575914231613858),\n",
       " ('does', 0.001650978113309233),\n",
       " ('then', 0.0016503393063871206),\n",
       " ('cso', 0.0016497877731677256),\n",
       " ('article', 0.0016454133127841955),\n",
       " ('time', 0.0016279825552943713),\n",
       " ('well', 0.001605327448956631),\n",
       " ('most', 0.001604213672047599),\n",
       " ('did', 0.0016040443508693093),\n",
       " ('very', 0.0015845007479517628),\n",
       " ('into', 0.0015724898873872573),\n",
       " ('see', 0.0015695820961212593),\n",
       " ('writes', 0.0015674817799180632),\n",
       " ('good', 0.0015601633761320331),\n",
       " ('make', 0.0015492363809311196),\n",
       " ('sun', 0.0015358774830682995),\n",
       " ('why', 0.0015258155350646604),\n",
       " ('even', 0.0015240299895419903),\n",
       " ('could', 0.0015205734534540637),\n",
       " ('power', 0.0015138563745380291),\n",
       " ('drive', 0.0015056405338091418),\n",
       " ('university', 0.0015023907092505627),\n",
       " ('mit', 0.0014956366139212895),\n",
       " ('government', 0.0014939542256905709),\n",
       " ('world', 0.0014864193281297403),\n",
       " ('way', 0.0014746440273967916),\n",
       " ('used', 0.0014713479687254254),\n",
       " ('said', 0.001469883130690464),\n",
       " ('mac', 0.0014650832383743366),\n",
       " ('many', 0.0014642364386624322),\n",
       " ('problem', 0.001461375836424699),\n",
       " ('access', 0.0014576355140487308),\n",
       " ('much', 0.0014504200034768679),\n",
       " ('religion', 0.0014490766525882268),\n",
       " ('harvard', 0.0014313135388140453),\n",
       " ('sys', 0.0014297397942396506),\n",
       " ('say', 0.0014264163039685869),\n",
       " ('eng', 0.00142614634192302),\n",
       " ('apple', 0.001424257557212391),\n",
       " ('6um', 0.0014230652272923488),\n",
       " ('hardware', 0.0014065019431267834),\n",
       " ('its', 0.0014042874726225263),\n",
       " ('first', 0.0014025510949719328),\n",
       " ('work', 0.001402106354093257),\n",
       " ('net', 0.001394045469998968),\n",
       " ('key', 0.0013917237790575445),\n",
       " ('those', 0.0013897424088005158),\n",
       " ('turkish', 0.0013860299855914804),\n",
       " ('nand', 0.0013806932806587192),\n",
       " ('need', 0.0013789963089290236),\n",
       " ('such', 0.0013766351733107494),\n",
       " ('going', 0.001373184297960818),\n",
       " ('now', 0.0013731772399458088),\n",
       " ('gun', 0.001371544493115593),\n",
       " ('right', 0.0013687027373673836),\n",
       " ('after', 0.0013668555031044268),\n",
       " ('nin', 0.001366406141697444),\n",
       " ('off', 0.0013572723054224923),\n",
       " ('our', 0.0013559433698663418),\n",
       " ('guns', 0.001355794787668849),\n",
       " ('mps', 0.001350514934245289),\n",
       " ('software', 0.0013504570129142471),\n",
       " ('year', 0.0013477500149507622),\n",
       " ('rochester', 0.0013379878137007948),\n",
       " ('andrew', 0.00132769058897536),\n",
       " ('microsoft', 0.0013171980892292172),\n",
       " ('program', 0.0013143089046661163),\n",
       " ('information', 0.0013138957643586192),\n",
       " ('data', 0.001312122526944564),\n",
       " ('same', 0.0013098884970351953),\n",
       " ('where', 0.0013024407807521918),\n",
       " ('want', 0.0012961856116960957),\n",
       " ('here', 0.0012888348575101787),\n",
       " ('cis', 0.001288529216663665),\n",
       " ('uunet', 0.0012834175423282919),\n",
       " ('him', 0.001278883217446137),\n",
       " ('must', 0.0012705753201013713),\n",
       " ('stratus', 0.0012698358550628532),\n",
       " ('nto', 0.0012693902686274272),\n",
       " ('being', 0.0012671857973942378),\n",
       " ('really', 0.0012662897841810897),\n",
       " ('over', 0.0012655319456737197),\n",
       " ('help', 0.0012633919263173442),\n",
       " ('ece', 0.00125408756434575),\n",
       " ('too', 0.0012525820532031788),\n",
       " ('law', 0.00125194366151028),\n",
       " ('since', 0.0012515791799405057),\n",
       " ('near', 0.0012485243998970872),\n",
       " ('posting', 0.001243639531201511),\n",
       " ('host', 0.0012421996047899911),\n",
       " ('2di', 0.0012416941689119515),\n",
       " ('newsgroups', 0.0012388797209300596),\n",
       " ('das', 0.001238570322683675),\n",
       " ('path', 0.0012320100869523572),\n",
       " ('sei', 0.001229715840980379),\n",
       " ('years', 0.0012290847111377694),\n",
       " ('entry', 0.0012280294557985205),\n",
       " ('34u', 0.0012277425490365365),\n",
       " ('security', 0.0012269435296599753),\n",
       " ('better', 0.0012268739249211686),\n",
       " ('utexas', 0.0012232008305264868),\n",
       " ('xref', 0.0012185009758466177),\n",
       " ('zaphod', 0.0012181671038560998),\n",
       " ('nnntp', 0.0012118920379577815),\n",
       " ('noc', 0.001209510914802772),\n",
       " ('crabapple', 0.0012088771042792527),\n",
       " ('gatech', 0.0012068757717055025),\n",
       " ('mail', 0.0012049525526046643),\n",
       " ('back', 0.0012021976697857708),\n",
       " ('cwru', 0.001201892146239995),\n",
       " ('udel', 0.0011979189495544681),\n",
       " ('club', 0.0011956975111064676),\n",
       " ('anyone', 0.0011954833566029572),\n",
       " ('encryption', 0.001193196665732807),\n",
       " ('version', 0.0011881811296478443),\n",
       " ('made', 0.0011869658026408467),\n",
       " ('computer', 0.0011852437743095669),\n",
       " ('fs7', 0.0011822282187504962),\n",
       " ('president', 0.0011810151622395485),\n",
       " ('something', 0.0011793423792798641),\n",
       " ('game', 0.0011773013696296836),\n",
       " ('number', 0.001175943344522865),\n",
       " ('usa', 0.0011720198266981284),\n",
       " ('last', 0.0011704800894792348),\n",
       " ('non', 0.0011701420663167027),\n",
       " ('europa', 0.0011672411931589867),\n",
       " ('card', 0.0011664821615196398),\n",
       " ('using', 0.0011664601583477859),\n",
       " ('between', 0.0011662173751810223),\n",
       " ('gtefsd', 0.0011607798660613663),\n",
       " ('both', 0.0011604544022725607),\n",
       " ('own', 0.0011560425629869122),\n",
       " ('case', 0.0011523706904467566),\n",
       " ('uucp', 0.0011499276466882963),\n",
       " ('berkeley', 0.0011442504515357717),\n",
       " ('75u', 0.0011440328297840452),\n",
       " ('window', 0.0011428809684308928),\n",
       " ('might', 0.0011424834789454759),\n",
       " ('nsender', 0.0011419341560382835),\n",
       " ('ans', 0.0011415864188185308),\n",
       " ('magnesium', 0.0011398927574177266),\n",
       " ('howland', 0.0011379057390294462),\n",
       " ('wire', 0.0011373801605008484),\n",
       " ('reston', 0.0011359496078297527),\n",
       " ('through', 0.0011358044354646837),\n",
       " ('sura', 0.0011296903480767196),\n",
       " ('still', 0.0011273711218362994),\n",
       " ('ndistribution', 0.001123751554515025),\n",
       " ('sure', 0.0011228796034226092),\n",
       " ('6ei', 0.001116129590033215),\n",
       " ('nof', 0.0011137321245614227),\n",
       " ('find', 0.0011130838507566417),\n",
       " ('believe', 0.0011124936142627019),\n",
       " ('science', 0.0011113157003102162),\n",
       " ('run', 0.0011051671176591218),\n",
       " ('team', 0.0011022808180902453),\n",
       " ('while', 0.0010996858450328675),\n",
       " ('point', 0.0010887223831887124),\n",
       " ('before', 0.001085406487538871),\n",
       " ('wiring', 0.0010837085328382135),\n",
       " ('without', 0.0010817587035178737),\n",
       " ('tthe', 0.0010814120024264457),\n",
       " ('clipper', 0.0010794814852824628),\n",
       " ('nthat', 0.0010792138907591167),\n",
       " ('under', 0.001074002015261105),\n",
       " ('public', 0.00107074753496034),\n",
       " ('chip', 0.001067302869616772),\n",
       " ('question', 0.0010633197872825272),\n",
       " ('file', 0.0010563524506698183),\n",
       " ('autos', 0.0010551727839403044),\n",
       " ('colorado', 0.0010495339087340093),\n",
       " ('abortion', 0.0010492011991832435),\n",
       " ('columbia', 0.001044162081288333),\n",
       " ('med', 0.001042382924390368),\n",
       " ('bit', 0.0010406239173103775),\n",
       " ('look', 0.0010377458582022702),\n",
       " ('jewish', 0.0010363047117711687),\n",
       " ('sport', 0.0010353200369411474),\n",
       " ('inc', 0.0010352907686094022),\n",
       " ('list', 0.001033306912508987),\n",
       " ('never', 0.0010332432982286986),\n",
       " ('digex', 0.001032428126942448),\n",
       " ('take', 0.0010227225933949135),\n",
       " ('washington', 0.0010217080539501422),\n",
       " ('long', 0.0010208618943389786),\n",
       " ('least', 0.0010174086899667444),\n",
       " ('line', 0.0010163130262618426),\n",
       " ('health', 0.001010443713501651),\n",
       " ('down', 0.0010079001975371485),\n",
       " ('nit', 0.001007535956688025),\n",
       " ('baseball', 0.0010058066688617283),\n",
       " ('clinton', 0.0010047497309621197),\n",
       " ('doesn', 0.0009993033618864316),\n",
       " ('david', 0.0009972366632625361),\n",
       " ('dec', 0.0009967793135449287),\n",
       " ('come', 0.0009928261043559289),\n",
       " ('ins', 0.0009923880106568765),\n",
       " ('hockey', 0.0009917470974739474),\n",
       " ('part', 0.000991547014510482),\n",
       " ('israel', 0.0009909790675634955),\n",
       " ('things', 0.0009890253953758726),\n",
       " ('enough', 0.000987214184234772),\n",
       " ('att', 0.0009868170213905912),\n",
       " ('research', 0.0009857316784995871),\n",
       " ('fbi', 0.0009856554627325174),\n",
       " ('etc', 0.000984650218808065),\n",
       " ('1993apr20', 0.0009828444261319588),\n",
       " ('she', 0.0009828180169700763),\n",
       " ('1993', 0.0009803870269244723),\n",
       " ('each', 0.0009782452945023997),\n",
       " ('send', 0.0009773228920840226),\n",
       " ('s0g', 0.000976613391279063),\n",
       " ('got', 0.0009734113861296845),\n",
       " ('day', 0.000970535880433432),\n",
       " ('anything', 0.000970332914314316),\n",
       " ('every', 0.0009701520604106111),\n",
       " ('high', 0.0009669082487957935),\n",
       " ('again', 0.0009660511017254846),\n",
       " ('another', 0.0009628334325204365),\n",
       " ('nyou', 0.0009621480680433369),\n",
       " ('nec', 0.0009603128947603929),\n",
       " ('info', 0.0009564390012431355),\n",
       " ('put', 0.0009562075942973606),\n",
       " ('code', 0.0009557291946712024),\n",
       " ('fire', 0.0009554640803293496),\n",
       " ('set', 0.0009535256785547011),\n",
       " ('read', 0.0009530893943909444),\n",
       " ('nreply', 0.0009523778312082092),\n",
       " ('possible', 0.0009523657893568334),\n",
       " ('000', 0.0009511581878391102),\n",
       " ('nnewsgroups', 0.0009478945580186233),\n",
       " ('mideast', 0.0009460758063760405),\n",
       " ('john', 0.0009459484118981068),\n",
       " ('next', 0.0009444131467002395),\n",
       " ('tue', 0.000943097196762451),\n",
       " ('support', 0.0009430424236319405),\n",
       " ('group', 0.0009427686101240325),\n",
       " ('toronto', 0.0009427029958692566),\n",
       " ('news', 0.0009405561865088178),\n",
       " ('electronics', 0.0009396515740195739),\n",
       " ('files', 0.000939593451965025),\n",
       " ('umd', 0.0009389136239578142),\n",
       " ('little', 0.0009386917205527692),\n",
       " ('few', 0.0009385660075520787),\n",
       " ('far', 0.00093741503687736),\n",
       " ('available', 0.000937255343398161),\n",
       " ('usc', 0.0009367694235465167),\n",
       " ('best', 0.0009351517597926299),\n",
       " ('technology', 0.0009340873825045318),\n",
       " ('crypt', 0.0009340873825045318),\n",
       " ('please', 0.0009295672512295597),\n",
       " ('course', 0.0009282554935313867),\n",
       " ('around', 0.0009277509643322326),\n",
       " ('let', 0.0009262035727939804),\n",
       " ('didn', 0.0009244255668758958),\n",
       " ('problems', 0.0009241906494131275),\n",
       " ('virginia', 0.0009239998570004054),\n",
       " ('0el', 0.0009208069117774022),\n",
       " ('acs', 0.0009203928553683848),\n",
       " ('tcp', 0.0009190559059912811),\n",
       " ('jim', 0.0009187135974769133),\n",
       " ('nthis', 0.0009159447011362761),\n",
       " ('car', 0.00091426980227215),\n",
       " ('systems', 0.0009102946894888784),\n",
       " ('hard', 0.000907735050398292),\n",
       " ('give', 0.0009064971137983638),\n",
       " ('request', 0.0009053179573995688),\n",
       " ('against', 0.0009027069492615655),\n",
       " ('fact', 0.0009008606437444271),\n",
       " ('stanford', 0.0009008008217664811),\n",
       " ('national', 0.0008991780798975114),\n",
       " ('purdue', 0.0008982369978619135),\n",
       " ('unix', 0.0008981256242659965),\n",
       " ('thing', 0.0008942273758950066),\n",
       " ('launch', 0.000894061540848693),\n",
       " ('venus', 0.0008938493944143051),\n",
       " ('probably', 0.0008920006334710121),\n",
       " ('lot', 0.0008917998288783674),\n",
       " ('server', 0.0008886053760614614),\n",
       " ('real', 0.0008881404533932224),\n",
       " ('old', 0.000886332561396144),\n",
       " ('bill', 0.0008852996670248451),\n",
       " ('someone', 0.0008844659241821571),\n",
       " ('nfor', 0.0008822032957619098),\n",
       " ('agate', 0.000881978154443248),\n",
       " ('soviet', 0.0008818071258332633),\n",
       " ('win', 0.0008802420633584494),\n",
       " ('control', 0.0008795858560792113),\n",
       " ('pitt', 0.0008795732469883588),\n",
       " ('freenet', 0.0008794922805137485),\n",
       " ('npath', 0.000874954326698977),\n",
       " ('either', 0.0008749383973551377),\n",
       " ('nreferences', 0.0008735023880414385),\n",
       " ('tell', 0.0008719594139529686),\n",
       " ('bb3', 0.0008704139430289802),\n",
       " ('name', 0.0008690656964940539),\n",
       " ('ucs', 0.0008657951415988051),\n",
       " ('caltech', 0.00086480545807008),\n",
       " ('reason', 0.000863765635250273),\n",
       " ('legal', 0.0008611846399018418),\n",
       " ('great', 0.0008595875826624336),\n",
       " ('won', 0.0008576147101922512),\n",
       " ('post', 0.0008565980279682722),\n",
       " ('less', 0.0008473068327456108),\n",
       " ('play', 0.000846061610173241),\n",
       " ('called', 0.0008460385937602517),\n",
       " ('try', 0.0008437767354207197),\n",
       " ('darwin', 0.0008428692149789537),\n",
       " ('candida', 0.0008404270254663697),\n",
       " ('fri', 0.0008393251091836614),\n",
       " ('able', 0.0008383732002477037),\n",
       " ('0em', 0.000837097192524911),\n",
       " ('done', 0.0008355380001113752),\n",
       " ('having', 0.0008353986655660172),\n",
       " ('mark', 0.0008343563869705552),\n",
       " ('actually', 0.0008331659132896435),\n",
       " ('network', 0.0008325311932091563),\n",
       " ('different', 0.0008325311932091563),\n",
       " ('higher', 0.0008318437428452701),\n",
       " ('start', 0.0008297760013219385),\n",
       " ('order', 0.0008279434953982341),\n",
       " ('ever', 0.000827355086963274),\n",
       " ('however', 0.0008272437239697343),\n",
       " ('her', 0.0008263381110241702),\n",
       " ('amiga', 0.000825637902673743),\n",
       " ('mitre', 0.0008235074420331675),\n",
       " ('yale', 0.0008217184493789995),\n",
       " ('ground', 0.0008217113376434104),\n",
       " ('based', 0.0008208754765591503),\n",
       " ('seems', 0.0008205009447089745),\n",
       " ('indiana', 0.0008196518228795982),\n",
       " ('person', 0.0008188878696113624),\n",
       " ('place', 0.000817084244435265),\n",
       " ('second', 0.0008158480103062051),\n",
       " ('true', 0.0008157862069426744),\n",
       " ('nif', 0.0008156206675492977),\n",
       " ('mon', 0.0008147724039325183),\n",
       " ('optilink', 0.0008145805540304104),\n",
       " ('shuttle', 0.0008145805540304104),\n",
       " ('michael', 0.0008118840026856327),\n",
       " ('1993apr15', 0.0008115028643548081),\n",
       " ('local', 0.0008106643006468441),\n",
       " ('motorcycles', 0.0008101026017829218),\n",
       " ('three', 0.000807782929052337),\n",
       " ('end', 0.0008073900664202822),\n",
       " ('arizona', 0.0008050593329483449),\n",
       " ('following', 0.0008049369419772443),\n",
       " ('though', 0.0008032011966078339),\n",
       " ('nis', 0.0008026198459905306),\n",
       " ('opinions', 0.0008006662131902651),\n",
       " ('heard', 0.0008006662131902651),\n",
       " ('general', 0.0007992998137630065),\n",
       " ('men', 0.0007956766235911279),\n",
       " ('jpl', 0.0007937191660753354),\n",
       " ('service', 0.0007934320935336864),\n",
       " ('canada', 0.0007931902159856238),\n",
       " ('says', 0.0007910267523516077),\n",
       " ('evidence', 0.0007880877101141906),\n",
       " ('house', 0.0007869973048730247),\n",
       " ('center', 0.0007869606288663084),\n",
       " ('rights', 0.0007864745190522061),\n",
       " ('found', 0.0007852186632279702),\n",
       " ('cleveland', 0.000785013846870916),\n",
       " ('times', 0.0007838746811111751),\n",
       " ('study', 0.0007836519851880604),\n",
       " ('isn', 0.0007832064219757052),\n",
       " ('several', 0.000781787193292776),\n",
       " ('above', 0.000781787193292776),\n",
       " ('women', 0.0007751993751352929),\n",
       " ('1993apr19', 0.0007690645012909346),\n",
       " ('uwm', 0.000766602034010288),\n",
       " ('standard', 0.0007657420490290582),\n",
       " ('privacy', 0.0007647615755129245),\n",
       " ('cable', 0.0007635614291881209),\n",
       " ('greek', 0.0007635614291881209),\n",
       " ('thu', 0.0007628040011303842),\n",
       " ('1993apr21', 0.0007625350627302666),\n",
       " ('april', 0.0007615973669170749),\n",
       " ('open', 0.0007603424076976901),\n",
       " ('pit', 0.0007601110850012237),\n",
       " ('money', 0.0007600729336801353),\n",
       " ('insurance', 0.0007599219745553176),\n",
       " ('user', 0.0007596161126368256),\n",
       " ('source', 0.0007580414593823635),\n",
       " ('simply', 0.000756528275229794),\n",
       " ('oracle', 0.0007556875555687636),\n",
       " ('rather', 0.0007534615703473855),\n",
       " ('police', 0.000752691456897756),\n",
       " ('yes', 0.0007513168589534515),\n",
       " ('getting', 0.0007498042747185288),\n",
       " ('man', 0.0007495762946024022),\n",
       " ('else', 0.0007489347010201099),\n",
       " ('cars', 0.0007481526203553466),\n",
       " ('nor', 0.0007479175860729734),\n",
       " ('means', 0.0007447229364746283),\n",
       " ('disk', 0.0007407856092252286),\n",
       " ('armenians', 0.0007398326835530155),\n",
       " ('god', 0.0007396281733613525),\n",
       " ('always', 0.0007395640406391227),\n",
       " ('home', 0.0007385322824745356),\n",
       " ('pay', 0.0007377460485042115),\n",
       " ('history', 0.0007370132755390427),\n",
       " ('life', 0.0007351123670678957),\n",
       " ('bad', 0.000734915863244169),\n",
       " ('current', 0.0007332598700610643),\n",
       " ('american', 0.0007321830355080975),\n",
       " ('medical', 0.0007293428307981842),\n",
       " ('buy', 0.0007292875698118136),\n",
       " ('machine', 0.0007285052035489176),\n",
       " ('wrong', 0.0007276682127385681),\n",
       " ('1993apr23', 0.0007272135638155514),\n",
       " ('left', 0.0007268033965571007),\n",
       " ('maybe', 0.000726512408614636),\n",
       " ('whether', 0.0007264251958765922),\n",
       " ('entries', 0.0007252862361044709),\n",
       " ('claim', 0.000722750109928419),\n",
       " ('keep', 0.0007226274595557839),\n",
       " ('1993apr22', 0.0007203716525776207),\n",
       " ('keys', 0.0007200218946371798),\n",
       " ('ago', 0.0007198500362372175),\n",
       " ('upenn', 0.0007198490272510901),\n",
       " ('austin', 0.0007198490272510901),\n",
       " ('text', 0.000719209329344882),\n",
       " ('message', 0.0007176800590402508),\n",
       " ('quite', 0.0007176800590402508),\n",
       " ('today', 0.0007163733341612844),\n",
       " ('seen', 0.0007159253368522761),\n",
       " ('scsi', 0.0007156364302373847),\n",
       " ('food', 0.0007154299698579752),\n",
       " ('mike', 0.0007152373496139287),\n",
       " ('others', 0.0007149804656207527),\n",
       " ('mean', 0.0007149804656207527),\n",
       " ('low', 0.0007118969035695221),\n",
       " ('sdd', 0.000710916970009732),\n",
       " ('trying', 0.0007104266182051468),\n",
       " ('call', 0.0007099057247328091),\n",
       " ('free', 0.0007098255829450992),\n",
       " ('started', 0.0007086987061144767),\n",
       " ('email', 0.0007056701333518544),\n",
       " ('box', 0.0007053210031919516),\n",
       " ('white', 0.0007053081404688079),\n",
       " ('eff', 0.0007050214384989052),\n",
       " ('area', 0.0007040220697791933),\n",
       " ('drug', 0.0007031083521987392),\n",
       " ('mouse', 0.0007024303239945861),\n",
       " ('idea', 0.0007020373502841386),\n",
       " ('armenian', 0.0007017311175991545),\n",
       " ('example', 0.0007011725477739554),\n",
       " ('magnus', 0.0006994437271860387),\n",
       " ('nothing', 0.0006993262022956914),\n",
       " ('video', 0.0006989953901516688),\n",
       " ('side', 0.0006965463999206074),\n",
       " ('during', 0.0006965463999206074),\n",
       " ('already', 0.0006959954273358122),\n",
       " ('manager', 0.0006944086896795297),\n",
       " ('anti', 0.0006931591702354119),\n",
       " ('press', 0.0006926923797644463),\n",
       " ('care', 0.0006923034108107279),\n",
       " ('given', 0.000690763558711294),\n",
       " ('israeli', 0.0006900779863003352),\n",
       " ('cup', 0.000687950698661064),\n",
       " ('doing', 0.000687810749294054),\n",
       " ('big', 0.0006870169429900241),\n",
       " ('perhaps', 0.0006867457949061245),\n",
       " ('wed', 0.0006853983544386849),\n",
       " ('seem', 0.0006845856662319463),\n",
       " ('california', 0.000683370833140891),\n",
       " ('nthanks', 0.0006831849641486327),\n",
       " ('away', 0.0006831087703613271),\n",
       " ('haven', 0.0006826755784315082),\n",
       " ('fan', 0.0006818575461368786),\n",
       " ('tin', 0.000680668398491026),\n",
       " ('questions', 0.0006799789395378035),\n",
       " ('nbut', 0.0006798149506893704),\n",
       " ('once', 0.0006798149506893704),\n",
       " ('full', 0.0006793195410150111),\n",
       " ('large', 0.0006793195410150111),\n",
       " ('looking', 0.0006769551629595981),\n",
       " ('small', 0.0006765700022380272),\n",
       " ('issue', 0.0006761621513320072),\n",
       " ('price', 0.0006758031206148527),\n",
       " ('cornell', 0.0006753119613288277),\n",
       " ('station', 0.0006752213692862529),\n",
       " ('yet', 0.0006751334126422132),\n",
       " ('remember', 0.0006751334126422132),\n",
       " ('cramer', 0.0006734374427213347),\n",
       " ('uchicago', 0.0006729143707207739),\n",
       " ('watson', 0.0006728095895442688),\n",
       " ('build', 0.000671701701589766),\n",
       " ('often', 0.000671661492951978),\n",
       " ('disease', 0.0006710184629477231),\n",
       " ('later', 0.0006701322915301675),\n",
       " ('private', 0.0006685836850136572),\n",
       " ('media', 0.0006685642615650023),\n",
       " ('copy', 0.0006685100797689148),\n",
       " ('usually', 0.0006682029318206137),\n",
       " ('human', 0.0006675202981532188),\n",
       " ('device', 0.0006666714626163519),\n",
       " ('circuit', 0.0006661635010245164),\n",
       " ('neutral', 0.0006645446895672757),\n",
       " ('company', 0.0006642687294700631),\n",
       " ('thanks', 0.0006637986733020484),\n",
       " ('fax', 0.0006636887118039159),\n",
       " ('earth', 0.000663345872590878),\n",
       " ('cause', 0.0006631590330371609),\n",
       " ('kind', 0.000662578647266036),\n",
       " ('uni', 0.000662065257104348),\n",
       " ('nkeywords', 0.0006618840695706192),\n",
       " ('wanted', 0.0006607926444418744),\n",
       " ('change', 0.0006600132613349167),\n",
       " ('school', 0.0006587432010943698),\n",
       " ('days', 0.0006586662260188821),\n",
       " ('mission', 0.0006585002706928409),\n",
       " ('society', 0.0006582415704253876),\n",
       " ('book', 0.000658092573902443),\n",
       " ('clock', 0.0006572813892969234),\n",
       " ('thought', 0.0006560044289664794),\n",
       " ('mbytes', 0.0006557261341445137),\n",
       " ('speed', 0.0006548532868839329),\n",
       " ('bogus', 0.0006532703065743716),\n",
       " ('monitor', 0.0006525198420149443),\n",
       " ('1993apr16', 0.0006510977534211737),\n",
       " ('nfollowup', 0.0006508412153202509),\n",
       " ('wupost', 0.0006466668688027746),\n",
       " ('provide', 0.0006465852610402082),\n",
       " ('george', 0.0006457722981025589),\n",
       " ('henry', 0.0006441777244298143),\n",
       " ('saying', 0.0006440041579203492),\n",
       " ('cost', 0.0006423908225300545),\n",
       " ('demon', 0.000642328037953565),\n",
       " ('rlk', 0.0006417745142690985),\n",
       " ('groups', 0.0006415978556156525),\n",
       " ('hand', 0.0006406601560695854),\n",
       " ('san', 0.0006405816320105697),\n",
       " ('department', 0.0006391757298269934),\n",
       " ('air', 0.0006390296427916062),\n",
       " ('instead', 0.0006386958914512465),\n",
       " ('frank', 0.0006382279218477666),\n",
       " ('whole', 0.0006376839265080926),\n",
       " ('type', 0.0006371527964474683),\n",
       " ('guess', 0.0006365412988729022),\n",
       " ('nas', 0.0006365298162406535),\n",
       " ('color', 0.0006361456519893354),\n",
       " ('feed', 0.000634256864663571),\n",
       " ('steve', 0.0006333555861393872),\n",
       " ('wrote', 0.0006332417412645504),\n",
       " ('until', 0.0006331457701116444),\n",
       " ('subject', 0.0006324949604662829),\n",
       " ('netnews', 0.0006323295834817932),\n",
       " ('apps', 0.0006318512068745483),\n",
       " ('sort', 0.0006318412931512168),\n",
       " ('bitnet', 0.0006293078750220739),\n",
       " ('games', 0.0006293078750220739),\n",
       " ('iastate', 0.0006286958211959376),\n",
       " ('makes', 0.00062829754615327),\n",
       " ('b8f', 0.0006278228943936833),\n",
       " ('needed', 0.0006262081620984742),\n",
       " ('york', 0.0006258392236134522),\n",
       " ('body', 0.0006258392236134522),\n",
       " ('children', 0.0006224060957122036),\n",
       " ('100', 0.0006213473388412584),\n",
       " ('ftp', 0.0006201595001082343),\n",
       " ('bnr', 0.0006190740066430268),\n",
       " ('mcgill', 0.0006181386851152359),\n",
       " ('sex', 0.0006173358221199856),\n",
       " ('rules', 0.0006163811571393132),\n",
       " ('uci', 0.0006163811571393132),\n",
       " ('atheism', 0.000615932185823235),\n",
       " ('ncsu', 0.00061514019615476),\n",
       " ('jews', 0.0006150187016692578),\n",
       " ('colostate', 0.0006149924028533139),\n",
       " ('memory', 0.000614841027320417),\n",
       " ('test', 0.000614764454122653),\n",
       " ('engineering', 0.00061138758691351),\n",
       " ('std', 0.0006109145722113717),\n",
       " ('level', 0.0006106013572290709),\n",
       " ('major', 0.0006090359411447516),\n",
       " ('black', 0.0006076076034695886),\n",
       " ('size', 0.0006076076034695886),\n",
       " ('internet', 0.0006064893526046007),\n",
       " ('killed', 0.0006055157801552383),\n",
       " ('nthere', 0.0006053543298971603),\n",
       " ('institute', 0.0006046450007103526),\n",
       " ('told', 0.0006041546913443724),\n",
       " ('board', 0.0006030548464053613),\n",
       " ('note', 0.0006027572201020222),\n",
       " ('communications', 0.0006023577897728507),\n",
       " ('escrow', 0.0006008735335301194),\n",
       " ('batf', 0.0006005338811776541),\n",
       " ('pretty', 0.0005989378477348928),\n",
       " ('running', 0.0005977951682775895),\n",
       " ('nwith', 0.0005966117292462604),\n",
       " ('sandvik', 0.0005959170650249238),\n",
       " ('cannot', 0.0005952305599321553),\n",
       " ('including', 0.000594335947016757),\n",
       " ('protect', 0.0005942939643584582),\n",
       " ('tax', 0.0005942939643584582),\n",
       " ('transfer', 0.0005941149095434671),\n",
       " ('agree', 0.0005939844174993512),\n",
       " ('sometimes', 0.0005935799217341816),\n",
       " ('cancer', 0.0005934157359134862),\n",
       " ('caen', 0.0005933075774010202),\n",
       " ('sources', 0.0005928347288548636),\n",
       " ('macintosh', 0.0005924350637397138),\n",
       " ('unless', 0.0005916418824155063),\n",
       " ('nare', 0.0005915880779161036),\n",
       " ('members', 0.0005908979661627243),\n",
       " ('koresh', 0.0005896400850477818),\n",
       " ('word', 0.0005883536428120184),\n",
       " ('certain', 0.0005877389888910453),\n",
       " ('cubs', 0.000586087267759442),\n",
       " ('p45', 0.0005859680347674378),\n",
       " ('none', 0.0005856721971369426),\n",
       " ('brian', 0.0005854899563072886),\n",
       " ('mot', 0.0005853335739491918),\n",
       " ('show', 0.0005825660799335989),\n",
       " ('screen', 0.0005817782172471448),\n",
       " ('environment', 0.0005817782172471448),\n",
       " ('math', 0.0005816678059618818),\n",
       " ('robert', 0.0005816678059618818),\n",
       " ('ucsd', 0.0005816426783968111),\n",
       " ('night', 0.0005812736185108942),\n",
       " ('themselves', 0.00057863876412066),\n",
       " ('players', 0.0005777812315615548),\n",
       " ('known', 0.0005777648581144606),\n",
       " ('tel', 0.0005775532893061072),\n",
       " ('nwhat', 0.0005773526594343524),\n",
       " ('wisc', 0.0005768289669683737),\n",
       " ('sgi', 0.0005761877368562358),\n",
       " ('unm', 0.0005761877368562358),\n",
       " ('programmer', 0.0005760827561817055),\n",
       " ('similar', 0.00057604076205075),\n",
       " ('pacific', 0.000576005708834044),\n",
       " ('james', 0.0005738405091250958),\n",
       " ('special', 0.0005737452034412585),\n",
       " ('driver', 0.0005736475376001167),\n",
       " ('gfci', 0.0005720164148920226),\n",
       " ('paul', 0.0005717018866330891),\n",
       " ('pat', 0.0005707228240395709),\n",
       " ('single', 0.0005699015999350424),\n",
       " ('weapons', 0.0005697990865610814),\n",
       " ('apollo', 0.0005697723732317468),\n",
       " ('effect', 0.0005670894443118594),\n",
       " ('americans', 0.0005670459548874152),\n",
       " ('feel', 0.0005670350159101417),\n",
       " ('period', 0.0005660544816458705),\n",
       " ('bike', 0.0005658955438487638),\n",
       " ('matter', 0.0005635693599357641),\n",
       " ('administration', 0.0005622527546997779),\n",
       " ('newsreader', 0.0005620086740202984),\n",
       " ('graphics', 0.0005608126142537563),\n",
       " ('working', 0.0005602915134565296),\n",
       " ('hot', 0.0005589452983890236),\n",
       " ('drivers', 0.0005581435500974109),\n",
       " ('sol', 0.0005581209236510121),\n",
       " ('24e', 0.0005580647950166075),\n",
       " ('3dy', 0.0005580647950166075),\n",
       " ('phone', 0.000557925937557593),\n",
       " ('address', 0.0005578220951236888),\n",
       " ('radar', 0.0005574708027652513),\n",
       " ('position', 0.0005567961196829777),\n",
       " ('live', 0.0005563256810309313),\n",
       " ('reply', 0.0005558068971941),\n",
       " ('important', 0.0005551489614991686),\n",
       " ('came', 0.0005551489614991686),\n",
       " ('ask', 0.0005548671903444773),\n",
       " ('theory', 0.0005546044442527859),\n",
       " ('sat', 0.0005526108469690351),\n",
       " ('dept', 0.0005526108469690351),\n",
       " ('wouldn', 0.0005518828576617567),\n",
       " ('answer', 0.0005517057406205408),\n",
       " ('likely', 0.0005516781302903623),\n",
       " ('truth', 0.0005515460064238761),\n",
       " ('turn', 0.000551037963019845),\n",
       " ('ram', 0.0005506283343693056),\n",
       " ('everyone', 0.0005501512895990309),\n",
       " ('hit', 0.0005486909685796725),\n",
       " ('stop', 0.000548238621711199),\n",
       " ('mil', 0.0005477723296349563),\n",
       " ('states', 0.0005476093206617706),\n",
       " ('programs', 0.0005470470132484134),\n",
       " ('check', 0.0005468362498721501),\n",
       " ('bus', 0.0005464345485863988),\n",
       " ('option', 0.0005464146489335869),\n",
       " ('within', 0.0005461309683835443),\n",
       " ('1993apr26', 0.0005454717810780521),\n",
       " ('requests', 0.0005441734576987419),\n",
       " ('s0t', 0.0005441131751411923),\n",
       " ('outlets', 0.0005441131751411923),\n",
       " ('works', 0.0005439646770783532),\n",
       " ('posted', 0.0005436789214861011),\n",
       " ('force', 0.0005435189644794916),\n",
       " ('services', 0.0005429092383429538),\n",
       " ('smith', 0.000542639562594705),\n",
       " ('alaska', 0.0005424264825712172),\n",
       " ('happy', 0.0005416564689824972),\n",
       " ('process', 0.0005415782095842419),\n",
       " ('federal', 0.0005415782095842419),\n",
       " ('corporation', 0.0005415527848610624),\n",
       " ('making', 0.0005405165188639462),\n",
       " ('pin', 0.0005396043884844113),\n",
       " ('sound', 0.0005389406625953305),\n",
       " ('war', 0.0005386283407494502),\n",
       " ('display', 0.0005384955960504915),\n",
       " ('motif', 0.0005382476716354151),\n",
       " ('nnot', 0.0005382403999386511),\n",
       " ('stuff', 0.0005380180183535722),\n",
       " ('nthey', 0.0005372065533731713),\n",
       " ('nso', 0.0005369342260392716),\n",
       " ('des', 0.0005368147703581785),\n",
       " ('convex', 0.000536667771212626),\n",
       " ('laws', 0.0005353331028590472),\n",
       " ('russian', 0.0005338470462358855),\n",
       " ('application', 0.0005333365214414352),\n",
       " ('advice', 0.0005333365214414352),\n",
       " ('strong', 0.0005333365214414352),\n",
       " ('147', 0.0005328365106633514),\n",
       " ('umn', 0.0005322149037152127),\n",
       " ('corp', 0.0005317638470918981),\n",
       " ('ecn', 0.0005304585513914551),\n",
       " ('_lw', 0.0005301615552657771),\n",
       " ('fine', 0.0005296459544197284),\n",
       " ('light', 0.0005289563136988289),\n",
       " ('specific', 0.000528374704624655),\n",
       " ('easy', 0.0005276729906046911),\n",
       " ('anybody', 0.0005274125675892694),\n",
       " ('taking', 0.0005273331891355573),\n",
       " ('hear', 0.0005266262909412747),\n",
       " ('future', 0.0005250165892876251),\n",
       " ('original', 0.0005250165892876251),\n",
       " ('mind', 0.0005248956276620216),\n",
       " ('country', 0.0005247669543670047),\n",
       " ('jesus', 0.0005245523436101796),\n",
       " ('nhave', 0.0005237477358132025),\n",
       " ('went', 0.0005234860648688527),\n",
       " ('difference', 0.0005234860648688527),\n",
       " ('lines', 0.0005233869441216257),\n",
       " ('tape', 0.000522552370864156),\n",
       " ('ubc', 0.0005224927262334099),\n",
       " ('short', 0.0005205529701909714),\n",
       " ('front', 0.0005198319906192044),\n",
       " ('common', 0.0005191620832247955),\n",
       " ('ncr', 0.0005190245405055788),\n",
       " ('tools', 0.0005189609266228789),\n",
       " ('acns', 0.000518883274842048),\n",
       " ('league', 0.0005184420158970653),\n",
       " ('couple', 0.0005183344323162462),\n",
       " ('msdos', 0.0005181619859983976),\n",
       " ('adobe', 0.0005181619859983976),\n",
       " ('include', 0.0005178307171283151),\n",
       " ('talking', 0.000517789449034382),\n",
       " ('comments', 0.0005165668491892031),\n",
       " ('g9v', 0.0005162099353903619),\n",
       " ('expect', 0.0005156189238823266),\n",
       " ('certainly', 0.0005156189238823266),\n",
       " ('section', 0.0005155083376858661),\n",
       " ('mchp', 0.0005155083376858661),\n",
       " ('sni', 0.0005155083376858661),\n",
       " ('armenia', 0.0005155083376858661),\n",
       " ('guy', 0.000514809437460516),\n",
       " ('devices', 0.0005145740066267771),\n",
       " ('image', 0.0005144543765990498),\n",
       " ('utk', 0.000514293637791688),\n",
       " ('turkey', 0.000514293637791688),\n",
       " ('policy', 0.0005142793843682089),\n",
       " ('electrical', 0.0005135399144974327),\n",
       " ('everything', 0.0005132146424377123),\n",
       " ('narticle', 0.000512577471856781),\n",
       " ('duke', 0.0005121918578443953),\n",
       " ('algorithm', 0.0005121264086386532),\n",
       " ('consider', 0.0005113797947606738),\n",
       " ('nmy', 0.000510582897868593),\n",
       " ('rush', 0.0005103413593986737),\n",
       " ('gary', 0.0005103309462858547),\n",
       " ('physics', 0.0005100155087013357),\n",
       " ('vms', 0.0005100155087013357),\n",
       " ('early', 0.0005097101157933566),\n",
       " ('college', 0.0005094308117286489),\n",
       " ('msg', 0.0005092862828639645),\n",
       " ('stafford', 0.0005086795154138553),\n",
       " ('itself', 0.0005081236006103795),\n",
       " ('third', 0.0005080170229035439),\n",
       " ('cases', 0.0005074035094016561),\n",
       " ('fans', 0.0005074035094016561),\n",
       " ('interested', 0.0005071216134990054),\n",
       " ('mode', 0.0005068260599570754),\n",
       " ('constitution', 0.0005064643317615527),\n",
       " ('saw', 0.0005063298609927316),\n",
       " ('add', 0.000505426651188714),\n",
       " ('anyway', 0.0005052120416246957),\n",
       " ('386', 0.0005046857780405804),\n",
       " ('nno', 0.0005033165207451974),\n",
       " ('understand', 0.0005033165207451974),\n",
       " ('needs', 0.0005028221412556085),\n",
       " ('diet', 0.0005027068234398695),\n",
       " ('nice', 0.0005024023222758464),\n",
       " ('peachnet', 0.0005022754097820482),\n",
       " ('6ql', 0.0005022583155149466),\n",
       " ('carleton', 0.0005022467431214974),\n",
       " ('enforcement', 0.0005022202515705279),\n",
       " ('port', 0.0005020998019909217),\n",
       " ('sfu', 0.000499656200784169),\n",
       " ('portal', 0.000499656200784169),\n",
       " ('plus', 0.0004994494713994455),\n",
       " ('arab', 0.0004986985053513272),\n",
       " ('almost', 0.0004983723446955927),\n",
       " ('water', 0.000498171072208486),\n",
       " ('situation', 0.0004967793834674312),\n",
       " ('write', 0.0004952655453080242),\n",
       " ('bbs', 0.0004950150525630611),\n",
       " ('built', 0.0004941565562186428),\n",
       " ('1993apr17', 0.0004941565562186428),\n",
       " ('goal', 0.0004941400464018904),\n",
       " ('court', 0.0004941400464018904),\n",
       " ('top', 0.000492393907888721),\n",
       " ('linus', 0.0004920802061616637),\n",
       " ('emory', 0.0004918118982636169),\n",
       " ('thus', 0.0004915793456766943),\n",
       " ('head', 0.0004907148457878205),\n",
       " ('conspiracy', 0.0004903393276959693),\n",
       " ('ericsson', 0.0004901757117202791),\n",
       " ('goes', 0.0004890461921801125),\n",
       " ('6umu', 0.0004883066956395315),\n",
       " ('magi', 0.0004883066956395315),\n",
       " ('date', 0.00048802296141070507),\n",
       " ('okstate', 0.00048701835796957974),\n",
       " ('ksu', 0.0004869775051268758),\n",
       " ('unit', 0.0004866487904089221),\n",
       " ('cards', 0.0004866487904089221),\n",
       " ('cut', 0.000486306441902703),\n",
       " ('except', 0.000485739936160495),\n",
       " ('ogicse', 0.00048550879155141757),\n",
       " ('hope', 0.00048542760846973307),\n",
       " ('reference', 0.0004850465963357975),\n",
       " ('equipment', 0.0004850465963357975),\n",
       " ('western', 0.0004846393980055052),\n",
       " ('simple', 0.00048410620570677127),\n",
       " ('business', 0.00048398488589081033),\n",
       " ('job', 0.0004836030364285766),\n",
       " ('political', 0.000483085444228256),\n",
       " ('ones', 0.0004827855980176444),\n",
       " ('individual', 0.00048230097806160915),\n",
       " ('msus', 0.0004821568733966375),\n",
       " ('vga', 0.0004805782782459063),\n",
       " ('1993apr6', 0.00048018890206320277),\n",
       " ('vax', 0.0004795739358388626),\n",
       " ('value', 0.0004795739358388626),\n",
       " ('mine', 0.00047932560173978594),\n",
       " ('asked', 0.00047915487658060446),\n",
       " ('especially', 0.000479085964432269),\n",
       " ('parts', 0.00047863123271130837),\n",
       " ('season', 0.00047863123271130837),\n",
       " ('considered', 0.00047856676070623703),\n",
       " ('via', 0.00047834597260791963),\n",
       " ('gmt', 0.0004781411563575598),\n",
       " ('font', 0.0004781054900355585),\n",
       " ('experience', 0.00047728780844329556),\n",
       " ('astro', 0.0004770909534915898),\n",
       " ('numbers', 0.00047701276231254145),\n",
       " ('longer', 0.00047578870529275095),\n",
       " ('account', 0.0004749121667337218),\n",
       " ('nvendor', 0.0004743550757641163),\n",
       " ('nethernet', 0.0004743550757641163),\n",
       " ('applications', 0.000474262628427249),\n",
       " ('required', 0.0004741521768180132),\n",
       " ('ux1', 0.0004738773683263368),\n",
       " ('blue', 0.0004738773683263368),\n",
       " ('package', 0.0004735630860956414),\n",
       " ('sense', 0.0004732767017305311),\n",
       " ('self', 0.0004729133126380028),\n",
       " ('orbit', 0.00047290375507453436),\n",
       " ('bob', 0.0004728716188325287),\n",
       " ('lead', 0.0004725143683556493),\n",
       " ('taken', 0.0004724319927040431),\n",
       " ('uses', 0.00047200331125850587),\n",
       " ('berlin', 0.00047028830806429886),\n",
       " ('field', 0.00046964829169021666),\n",
       " ('buffalo', 0.00046926923221317124),\n",
       " ('miles', 0.00046872586553368256),\n",
       " ('brown', 0.00046857147217923997),\n",
       " ('interesting', 0.00046812770720895436),\n",
       " ('results', 0.00046798902409612297),\n",
       " ('_____', 0.00046782074506610303),\n",
       " ('red', 0.0004674860640741734),\n",
       " ('civil', 0.0004672241201026323),\n",
       " ('previous', 0.00046680457178415266),\n",
       " ('ucsu', 0.00046643769174076227),\n",
       " ('nnetwork', 0.0004663471505735268),\n",
       " ('clarkson', 0.0004663471505735268),\n",
       " ('events', 0.0004662242787928889),\n",
       " ('christian', 0.0004656377232128027),\n",
       " ('objective', 0.000465398205300889),\n",
       " ('term', 0.00046501414028651193),\n",
       " ('input', 0.0004645613888723601),\n",
       " ('doctor', 0.0004645613888723601),\n",
       " ('trade', 0.00046439676222824965),\n",
       " ('faq', 0.00046430550498227006),\n",
       " ('bmw', 0.00046430550498227006),\n",
       " ('elroy', 0.0004639377402901159),\n",
       " ('ctr', 0.00046365101391634424),\n",
       " ('response', 0.0004629391264530198),\n",
       " ('ncpu', 0.0004626825988942314),\n",
       " ('gets', 0.00046134324768418687),\n",
       " ('page', 0.0004609440805162559),\n",
       " ('shot', 0.0004605662623799647),\n",
       " ('csd', 0.00046036856393965063),\n",
       " ('agencies', 0.00046005199086689017),\n",
       " ('swrinde', 0.0004595381360180141),\n",
       " ('neither', 0.0004593567987384566),\n",
       " ('psinntp', 0.0004592620045745692),\n",
       " ('project', 0.0004590724073000766),\n",
       " ('nmouse', 0.00045868772370763494),\n",
       " ('noperating', 0.00045868772370763494),\n",
       " ('books', 0.0004581859291585287),\n",
       " ('cts', 0.00045807447954389254),\n",
       " ('fast', 0.0004577001806143087),\n",
       " ('chem', 0.0004572918546478061),\n",
       " ...]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "last_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(49057, 2)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mat(last_result).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
